1   package org.apache.lucene.search.spell;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.Comparator;
22  import java.util.PriorityQueue;
23  import java.util.Queue;
24  
25  import org.apache.lucene.index.IndexReader;
26  import org.apache.lucene.index.Term;
27  import org.apache.lucene.search.spell.SuggestMode;
28  import org.apache.lucene.util.BytesRef;
29  
30  /**
31   * <p>
32   * A spell checker whose sole function is to offer suggestions by combining
33   * multiple terms into one word and/or breaking terms into multiple words.
34   * </p>
35   */
36  public class WordBreakSpellChecker {
37    private int minSuggestionFrequency = 1;
38    private int minBreakWordLength = 1;
39    private int maxCombineWordLength = 20;
40    private int maxChanges = 1;
41    private int maxEvaluations = 1000;
42    
43    /** Term that can be used to prohibit adjacent terms from being combined */
44    public static final Term SEPARATOR_TERM = new Term("", "");
45    
46    /** 
47     * Creates a new spellchecker with default configuration values
48     * @see #setMaxChanges(int)
49     * @see #setMaxCombineWordLength(int)
50     * @see #setMaxEvaluations(int)
51     * @see #setMinBreakWordLength(int)
52     * @see #setMinSuggestionFrequency(int)
53     */
54    public WordBreakSpellChecker() {}
55  
56    /**
57     * <p>
58     * Determines the order to list word break suggestions
59     * </p>
60     */
61    public enum BreakSuggestionSortMethod {
62      /**
63       * <p>
64       * Sort by Number of word breaks, then by the Sum of all the component
65       * term's frequencies
66       * </p>
67       */
68      NUM_CHANGES_THEN_SUMMED_FREQUENCY,
69      /**
70       * <p>
71       * Sort by Number of word breaks, then by the Maximum of all the component
72       * term's frequencies
73       * </p>
74       */
75      NUM_CHANGES_THEN_MAX_FREQUENCY
76    }
77    
78    /**
79     * <p>
80     * Generate suggestions by breaking the passed-in term into multiple words.
81     * The scores returned are equal to the number of word breaks needed so a
82     * lower score is generally preferred over a higher score.
83     * </p>
84     * 
85     * @param suggestMode
86     *          - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
87     * @param sortMethod
88     *          - default =
89     *          {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
90     * @return one or more arrays of words formed by breaking up the original term
91     * @throws IOException If there is a low-level I/O error.
92     */
93    public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
94        IndexReader ir, SuggestMode suggestMode,
95        BreakSuggestionSortMethod sortMethod) throws IOException {
96      if (maxSuggestions < 1) {
97        return new SuggestWord[0][0];
98      }
99      if (suggestMode == null) {
100       suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
101     }
102     if (sortMethod == null) {
103       sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
104     }
105     
106     int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
107     Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
108         : new LengthThenSumFreqComparator();
109     Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<>(
110         queueInitialCapacity, queueComparator);
111     
112     int origFreq = ir.docFreq(term);
113     if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
114       return new SuggestWord[0][];
115     }
116     
117     int useMinSuggestionFrequency = minSuggestionFrequency;
118     if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
119       useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
120     }
121     
122     generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
123         useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
124         sortMethod);
125     
126     SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
127     for (int i = suggestions.size() - 1; i >= 0; i--) {
128       suggestionArray[i] = suggestions.remove().suggestWords;
129     }
130     
131     return suggestionArray;
132   }
133   
134   /**
135    * <p>
136    * Generate suggestions by combining one or more of the passed-in terms into
137    * single words. The returned {@link CombineSuggestion} contains both a
138    * {@link SuggestWord} and also an array detailing which passed-in terms were
139    * involved in creating this combination. The scores returned are equal to the
140    * number of word combinations needed, also one less than the length of the
141    * array {@link CombineSuggestion#originalTermIndexes}. Generally, a
142    * suggestion with a lower score is preferred over a higher score.
143    * </p>
144    * <p>
145    * To prevent two adjacent terms from being combined (for instance, if one is
146    * mandatory and the other is prohibited), separate the two terms with
147    * {@link WordBreakSpellChecker#SEPARATOR_TERM}
148    * </p>
149    * <p>
150    * When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
151    * suggestion will include at least one term not in the index.
152    * </p>
153    * <p>
154    * When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
155    * suggestion will have the same, or better frequency than the most-popular
156    * included term.
157    * </p>
158    * 
159    * @return an array of words generated by combining original terms
160    * @throws IOException If there is a low-level I/O error.
161    */
162   public CombineSuggestion[] suggestWordCombinations(Term[] terms,
163       int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
164       throws IOException {
165     if (maxSuggestions < 1) {
166       return new CombineSuggestion[0];
167     }
168     
169     int[] origFreqs = null;
170     if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
171       origFreqs = new int[terms.length];
172       for (int i = 0; i < terms.length; i++) {
173         origFreqs[i] = ir.docFreq(terms[i]);
174       }
175     }
176     
177     int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
178     Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
179     Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<>(
180         queueInitialCapacity, queueComparator);
181     
182     int thisTimeEvaluations = 0;
183     for (int i = 0; i < terms.length - 1; i++) {
184       if (terms[i].equals(SEPARATOR_TERM)) {
185         continue;
186       }      
187       String leftTermText = terms[i].text();
188       int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
189       if (leftTermLength > maxCombineWordLength) {
190        continue;
191       } 
192       int maxFreq = 0;
193       int minFreq = Integer.MAX_VALUE;
194       if (origFreqs != null) {
195         maxFreq = origFreqs[i];
196         minFreq = origFreqs[i];
197       } 
198       String combinedTermText = leftTermText;
199       int combinedLength = leftTermLength;
200       for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
201         if (terms[j].equals(SEPARATOR_TERM)) {
202           break;
203         }
204         String rightTermText = terms[j].text();
205         int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
206         combinedTermText += rightTermText;
207         combinedLength +=rightTermLength;
208         if (combinedLength > maxCombineWordLength) {
209           break;
210         }
211         
212         if (origFreqs != null) {
213           maxFreq = Math.max(maxFreq, origFreqs[j]);
214           minFreq = Math.min(minFreq, origFreqs[j]);
215         }
216                 
217         Term combinedTerm = new Term(terms[0].field(), combinedTermText);
218         int combinedTermFreq = ir.docFreq(combinedTerm);
219         
220         if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
221             || combinedTermFreq >= maxFreq) {
222           if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
223               || minFreq == 0) {
224             if (combinedTermFreq >= minSuggestionFrequency) {
225               int[] origIndexes = new int[j - i + 1];
226               origIndexes[0] = i;
227               for (int k = 1; k < origIndexes.length; k++) {
228                 origIndexes[k] = i + k;
229               }
230               SuggestWord word = new SuggestWord();
231               word.freq = combinedTermFreq;
232               word.score = origIndexes.length - 1;
233               word.string = combinedTerm.text();
234               CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
235                   new CombineSuggestion(word, origIndexes),
236                   (origIndexes.length - 1));
237               suggestions.offer(suggestion);
238               if (suggestions.size() > maxSuggestions) {
239                 suggestions.poll();
240               }
241             }
242           }
243         }
244         thisTimeEvaluations++;
245         if (thisTimeEvaluations == maxEvaluations) {
246           break;
247         }
248       }
249     }
250     CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
251         .size()];
252     for (int i = suggestions.size() - 1; i >= 0; i--) {
253       combineSuggestions[i] = suggestions.remove().combineSuggestion;
254     }
255     return combineSuggestions;
256   }
257   
258   private int generateBreakUpSuggestions(Term term, IndexReader ir,
259       int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
260       SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
261       int totalEvaluations, BreakSuggestionSortMethod sortMethod)
262       throws IOException {
263     String termText = term.text();
264     int termLength = termText.codePointCount(0, termText.length());
265     int useMinBreakWordLength = minBreakWordLength;
266     if (useMinBreakWordLength < 1) {
267       useMinBreakWordLength = 1;
268     }
269     if (termLength < (useMinBreakWordLength * 2)) {
270       return 0;
271     }    
272     
273     int thisTimeEvaluations = 0;
274     for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
275       int end = termText.offsetByCodePoints(0, i);
276       String leftText = termText.substring(0, end);
277       String rightText = termText.substring(end);
278       SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
279       
280       if (leftWord.freq >= useMinSuggestionFrequency) {
281         SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
282         if (rightWord.freq >= useMinSuggestionFrequency) {
283           SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
284               newSuggestion(prefix, leftWord, rightWord));
285           suggestions.offer(suggestion);
286           if (suggestions.size() > maxSuggestions) {
287             suggestions.poll();
288           }
289         }        
290         int newNumberBreaks = numberBreaks + 1;
291         if (newNumberBreaks <= maxChanges) {
292           int evaluations = generateBreakUpSuggestions(new Term(term.field(),
293               rightWord.string), ir, newNumberBreaks, maxSuggestions,
294               useMinSuggestionFrequency, newPrefix(prefix, leftWord),
295               suggestions, totalEvaluations, sortMethod);
296           totalEvaluations += evaluations;
297         }
298       }
299       
300       thisTimeEvaluations++;
301       totalEvaluations++;
302       if (totalEvaluations >= maxEvaluations) {
303         break;
304       }
305     }
306     return thisTimeEvaluations;
307   }
308   
309   private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
310     SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
311     System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
312     newPrefix[newPrefix.length - 1] = append;
313     return newPrefix;
314   }
315   
316   private SuggestWord[] newSuggestion(SuggestWord[] prefix,
317       SuggestWord append1, SuggestWord append2) {
318     SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
319     int score = prefix.length + 1;
320     for (int i = 0; i < prefix.length; i++) {
321       SuggestWord word = new SuggestWord();
322       word.string = prefix[i].string;
323       word.freq = prefix[i].freq;
324       word.score = score;
325       newSuggestion[i] = word;
326     }
327     append1.score = score;
328     append2.score = score;
329     newSuggestion[newSuggestion.length - 2] = append1;
330     newSuggestion[newSuggestion.length - 1] = append2;
331     return newSuggestion;
332   }
333   
334   private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
335     Term term = new Term(fieldname, text);
336     int freq = ir.docFreq(term);
337     SuggestWord word = new SuggestWord();
338     word.freq = freq;
339     word.score = 1;
340     word.string = text;
341     return word;
342   }
343   
344   /**
345    * Returns the minimum frequency a term must have
346    * to be part of a suggestion.
347    * @see #setMinSuggestionFrequency(int)
348    */
349   public int getMinSuggestionFrequency() {
350     return minSuggestionFrequency;
351   }
352   
353   /**
354    * Returns the maximum length of a combined suggestion
355    * @see #setMaxCombineWordLength(int)
356    */
357   public int getMaxCombineWordLength() {
358     return maxCombineWordLength;
359   }
360   
361   /**
362    * Returns the minimum size of a broken word
363    * @see #setMinBreakWordLength(int)
364    */
365   public int getMinBreakWordLength() {
366     return minBreakWordLength;
367   }
368   
369   /**
370    * Returns the maximum number of changes to perform on the input
371    * @see #setMaxChanges(int)
372    */
373   public int getMaxChanges() {
374     return maxChanges;
375   }
376   
377   /**
378    * Returns the maximum number of word combinations to evaluate.
379    * @see #setMaxEvaluations(int)
380    */
381   public int getMaxEvaluations() {
382     return maxEvaluations;
383   }
384   
385   /**
386    * <p>
387    * The minimum frequency a term must have to be included as part of a
388    * suggestion. Default=1 Not applicable when used with
389    * {@link SuggestMode#SUGGEST_MORE_POPULAR}
390    * </p>
391    * 
392    * @see #getMinSuggestionFrequency()
393    */
394   public void setMinSuggestionFrequency(int minSuggestionFrequency) {
395     this.minSuggestionFrequency = minSuggestionFrequency;
396   }
397   
398   /**
399    * <p>
400    * The maximum length of a suggestion made by combining 1 or more original
401    * terms. Default=20
402    * </p>
403    * 
404    * @see #getMaxCombineWordLength()
405    */
406   public void setMaxCombineWordLength(int maxCombineWordLength) {
407     this.maxCombineWordLength = maxCombineWordLength;
408   }
409   
410   /**
411    * <p>
412    * The minimum length to break words down to. Default=1
413    * </p>
414    * 
415    * @see #getMinBreakWordLength()
416    */
417   public void setMinBreakWordLength(int minBreakWordLength) {
418     this.minBreakWordLength = minBreakWordLength;
419   }
420   
421   /**
422    * <p>
423    * The maximum numbers of changes (word breaks or combinations) to make on the
424    * original term(s). Default=1
425    * </p>
426    * 
427    * @see #getMaxChanges()
428    */
429   public void setMaxChanges(int maxChanges) {
430     this.maxChanges = maxChanges;
431   }
432   
433   /**
434    * <p>
435    * The maximum number of word combinations to evaluate. Default=1000. A higher
436    * value might improve result quality. A lower value might improve
437    * performance.
438    * </p>
439    * 
440    * @see #getMaxEvaluations()
441    */
442   public void setMaxEvaluations(int maxEvaluations) {
443     this.maxEvaluations = maxEvaluations;
444   }
445   
446   private class LengthThenMaxFreqComparator implements
447       Comparator<SuggestWordArrayWrapper> {
448     @Override
449     public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
450       if (o1.suggestWords.length != o2.suggestWords.length) {
451         return o2.suggestWords.length - o1.suggestWords.length;
452       }
453       if (o1.freqMax != o2.freqMax) {
454         return o1.freqMax - o2.freqMax;
455       }
456       return 0;
457     }
458   }
459   
460   private class LengthThenSumFreqComparator implements
461       Comparator<SuggestWordArrayWrapper> {
462     @Override
463     public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
464       if (o1.suggestWords.length != o2.suggestWords.length) {
465         return o2.suggestWords.length - o1.suggestWords.length;
466       }
467       if (o1.freqSum != o2.freqSum) {
468         return o1.freqSum - o2.freqSum;
469       }
470       return 0;
471     }
472   }
473   
474   private class CombinationsThenFreqComparator implements
475       Comparator<CombineSuggestionWrapper> {
476     @Override
477     public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
478       if (o1.numCombinations != o2.numCombinations) {
479         return o2.numCombinations - o1.numCombinations;
480       }
481       if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
482         return o1.combineSuggestion.suggestion.freq
483             - o2.combineSuggestion.suggestion.freq;
484       }
485       return 0;
486     }
487   }
488   
489   private class SuggestWordArrayWrapper {
490     final SuggestWord[] suggestWords;
491     final int freqMax;
492     final int freqSum;
493     
494     SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
495       this.suggestWords = suggestWords;
496       int aFreqSum = 0;
497       int aFreqMax = 0;
498       for (SuggestWord sw : suggestWords) {
499         aFreqSum += sw.freq;
500         aFreqMax = Math.max(aFreqMax, sw.freq);
501       }
502       this.freqSum = aFreqSum;
503       this.freqMax = aFreqMax;
504     }
505   }
506   
507   private class CombineSuggestionWrapper {
508     final CombineSuggestion combineSuggestion;
509     final int numCombinations;
510     
511     CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
512         int numCombinations) {
513       this.combineSuggestion = combineSuggestion;
514       this.numCombinations = numCombinations;
515     }
516   }
517 }